import re

def clean_text(text):
    """Strip conversion noise from a Markdown/plain-text document.

    The following are removed, in this order: HTML tags, Markdown image
    links, ``$...$`` / ``$$...$$`` spans, "第 N 页 共 M 页" page markers
    (with or without a leading ``#``), and figure/table captions that
    start with "图 N" / "表 N".

    Args:
        text: The document content as a single string.

    Returns:
        The cleaned string. Whitespace and line breaks that are not part
        of a removed span are left untouched.
    """
    # Remove HTML tags ('.' does not cross newlines, so a tag must sit on one line).
    text = re.sub(r'<.*?>', '', text)
    # Remove Markdown image links: ![alt](target)
    text = re.sub(r'!\[.*?]\(.*?\)', '', text)
    # Remove $-delimited spans. Matching is non-greedy so that ordinary text
    # between two separate spans on the same line is preserved; the `$$...$$`
    # alternative comes first so one-line display blocks are removed whole.
    text = re.sub(r'\$\$.*?\$\$|\$.*?\$', '', text)
    # Disabled steps, kept for reference:
    #   strip punctuation / special characters:
    #     text = re.sub(r'[^\w\s]', '', text)
    #   strip digits:
    #     text = re.sub(r'\d+', '', text)
    #   collapse runs of whitespace and newlines:
    #     text = re.sub(r'\s+', ' ', text).strip()
    #   lowercase:
    #     text = text.lower()
    # Remove page markers, first the Markdown-heading form ("# 第 1 页 共 9 页"),
    # then the bare form together with any whitespace in front of it.
    text = re.sub(r'#\s*第\s*\d+\s*页\s*共\s*\d+\s*页', '', text)
    text = re.sub(r'\s*第\s*\d+\s*页\s*共\s*\d+\s*页', '', text)
    # Remove figure / table captions.
    # NOTE(review): the trailing class `[\u4e00 -\u9fa5]` is a range from
    # U+0020 (space) to U+9FA5, i.e. far more than CJK ideographs; it may be a
    # typo for `\u4e00-\u9fa5`. The figure pattern also uses `.*` where the
    # table pattern uses `-*`. Both are kept as-is because existing output
    # depends on them -- confirm the intent before tightening.
    text = re.sub(r'图 *\d+.*\d* *[a-zA-Z\u4e00-\u9fa5\'\"‘“\u300a(\[{][\u4e00 -\u9fa5]+ *', '', text)
    text = re.sub(r'表 *\d+-*\d* *[a-zA-Z\u4e00-\u9fa5\'\"‘“\u300a(\[{][\u4e00 -\u9fa5]+ *', '', text)
    return text

if __name__ == '__main__':
    # Input Markdown file and the destination for the cleaned text.
    txt_path = "tmp_txt/01_md/04_银河麒麟桌面操作系统V10 SP1 2503产品用户手册.md"
    output_path = "tmp_txt/02_clean/04_银河麒麟桌面操作系统V10 SP1 2503产品用户手册.txt"

    # Step 1: load the whole source document as UTF-8 text.
    with open(txt_path, encoding="utf-8") as src:
        raw = src.read()

    # Step 2: clean it before the output file is opened, so a failure here
    # leaves any existing output file untouched.
    cleaned = clean_text(raw)

    # Step 3: write the cleaned text, overwriting any previous output.
    with open(output_path, mode="w", encoding="utf-8") as dst:
        dst.write(cleaned)

    print(f"清洗后的文本已成功保存到 {output_path}")